* Firm organization with multiple establishments
* Section V: Table D.17 data preparations 2000-2010 data

clear all 
set matsize 2000
set more off

capture log close
log using log/23_train_layer-est_data.log, replace

/*
CONTENTS:
Data preparation to study impact of high-speed train routes on multi-establishment firms

	Establishment panel data
	Adjust establishment dynamics
	Adjust establishment location
	Merge firm characteristics
	Merge sector characteristics
	Merge data on trains
	Variable definition
*/

********************************************************************************
***	ESTABLISHMENT PANEL DATA

use data/Panel.dta if jahr >= 2000 & jahr <= 2010, clear

sort betnr jahr
egen flg_estjhr = tag(betnr jahr)
egen flg_untjhr = tag(untid jahr) 

***	Restrict sample to multi-establishment firms
bys untid jahr: egen count_est = total(flg_estjhr)
bys untid: egen ever_ME = max(count_est > 1)
keep if ever_ME == 1 
drop ever_ME

***	Restrict sample to firms with at least 10 employees every year
bys betnr jahr: egen empl_bet = count(persnr)
bys untid jahr: egen empl_unt = count(persnr)
bys untid: egen sample = min(empl_unt >= 10)
keep if sample == 1
drop sample

merge m:1 betnr jahr using data/BHPinclUntID.dta, keep(3)
drop _merge

//	Number of managerial layers
bys betnr jahr: egen lowest = min(layer)
bys betnr jahr: egen second_lowest = min(layer) if layer > lowest
bys betnr jahr: egen third_lowest = min(layer) if layer > second_lowest
bys betnr jahr: egen highest = min(layer) if layer > third_lowest

qui gen layer_rank = .
qui replace layer_rank = 0 if layer == lowest
qui replace layer_rank = 1 if layer == second_lowest & layer_rank == .
qui replace layer_rank = 2 if layer == third_lowest & layer_rank == .
qui replace layer_rank = 3 if layer == highest & layer_rank == .
drop lowest *lowest highest

egen flg_untjhrlay = tag(untid jahr layer_rank)
qui replace flg_untjhrlay = 0 if layer_rank == 0
bys untid jahr: egen count_mgmt_unt = total(flg_untjhrlay)
drop flg_untjhrlay

egen flg_estjhrlay = tag(betnr jahr layer_rank)
replace flg_estjhrlay = 0 if layer_rank == 0
bys betnr jahr: egen count_mgmt_bet = total(flg_estjhrlay)
drop flg_estjhrlay

//	Management share (employment)
gen mgmt_lay = (layer_rank > 0)
bys betnr jahr: egen nbr_mgmt_bet = total(mgmt_lay)
bys untid jahr: egen nbr_mgmt_unt = total(mgmt_lay)
gen shr_mgmt_bet = (nbr_mgmt_bet / empl_bet) * 100
gen shr_mgmt_unt = (nbr_mgmt_unt / empl_unt) * 100
gen prdt_wkrs_unt = empl_unt - nbr_mgmt_unt
gen prdt_wkrs_bet = empl_bet - nbr_mgmt_bet
drop nbr_mgmt*

//	Management share (wage sum)
bys untid jahr: egen wage_sum_unt = total(tentgelt)
bys untid jahr: egen wage_sum_mgmt = total(mgmt_lay * tentgelt)
gen shr_mgmt_w_unt = (wage_sum_mgmt / wage_sum_unt) * 100
bys betnr jahr: egen wage_sum_bet = total(tentgelt)
bys betnr jahr: egen wage_sum_mgmt_bet = total(mgmt_lay * tentgelt)
gen shr_mgmt_w_bet = (wage_sum_mgmt_bet / wage_sum_bet) * 100

//	Management share (Blossfeld)
merge m:1 beruf using data/KldB1988_Blossfeld.dta
drop if _merge == 2
drop _merge

gen d_mgr = (blossfeld == 12)
bys betnr jahr: egen shr_mgmt_bet_bloss = mean(d_mgr * 100)
bys untid jahr: egen shr_mgmt_unt_bloss = mean(d_mgr * 100)
bys untid jahr: egen wage_unt_bloss = total(d_mgr * tentgelt)
gen shr_bloss_w_unt = (wage_unt_bloss / wage_sum_unt) * 100
bys betnr jahr: egen wage_bet_bloss = total(d_mgr * tentgelt)
gen shr_bloss_w_bet = (wage_bet_bloss / wage_sum_bet) * 100

//	Wages
cap drop lnw
gen lnw = ln(tentgelt)

bys untid jahr: egen lowest_layer_firm = min(layer_rank)
bys betnr jahr: egen lowest_layer_est = min(layer_rank)

// Firm-level
bys untid jahr: egen avg_lnw_unt = mean(lnw)
// Establishment-level
bys betnr jahr: egen avg_lnw_bet = mean(lnw)

// Bottom-layers
	// Firm-level
bys untid jahr layer_rank: egen aux_avg_lnw_lay_unt = mean(lnw)
replace aux_avg_lnw_lay_unt = . if layer_rank > lowest_layer_firm
bys untid jahr: egen avg_lnw_bot_unt = max(aux_avg_lnw_lay_unt)
drop aux*

	// Establishment-level
bys betnr jahr layer_rank: egen aux_avg_lnw_lay_bet = mean(lnw)
replace aux_avg_lnw_lay_bet = . if layer_rank > lowest_layer_est
bys betnr jahr: egen avg_lnw_bot_bet = max(aux_avg_lnw_lay_bet)
drop aux*

keep untid jahr betnr hq_* hauptbet w* ao_* d_* prdt* count_* empl* shr_* Recht* avg_lnw* 
order untid jahr prdt_wkrs_unt count_est count_mgmt_unt Rechtsform_neu hq_kreis hq_sector betnr prdt_wkrs_bet count_mgmt_bet hauptbet ao_kreis w08_5 w93_3_gen

egen flg_estjhr = tag(betnr jahr)
keep if flg_estjhr == 1
drop flg_estjhr

compress 
save data/temp_train_est.dta, replace

********************************************************************************
***	ADJUST ESTABLISHMENT DYNAMICS

use data/temp_train_est.dta, clear

merge 1:1 betnr jahr using data/bhp_7514_auslong_trains.dta, keep(1 3)

gen double betnr_orig = betnr

qui gen long suc = betnr_nach0 if betnr_nach0 != . & betnr_nach1 == .
bys betnr: egen long successor = max(suc)
qui replace betnr = successor if successor != .

cap drop test
bys untid betnr jahr: gen test = _N
tab test, m

replace betnr = betnr_orig if test == 2
cap drop test
bys untid betnr jahr: gen test = _N
tab test, m

qui gen long suc1 = betnr_nach1 if betnr_nach1 != . 
bys betnr: egen long successor1 = max(suc1)
qui replace betnr = successor1 if successor1 != .

cap drop test
bys untid betnr jahr: gen test = _N
tab test, m

drop untid0 austritt0 betnr_nach0 jahr1 austritt1 betnr_nach1 jahr2 austritt2 untid_nach1 betnr_nach2 untid_nach2 _merge suc successor suc1 successor1 test

merge m:1 betnr using data/bhp_estlife.dta
drop if _merge == 2
drop _merge

replace hauptbet = 0 if count_est == 1

********************************************************************************
***	ADJUST ESTABLISHMENT LOCATION

//	Establishments and headquarters sometimes move
//	Main analyses: assign hq_kreis/ao_kreis in first year to all years
//	Robustness: Drop movers

gen ao_kreis_orig = ao_kreis 
label values ao_kreis_orig ao_kreis_de
gen hq_kreis_orig = hq_kreis 
label values hq_kreis_orig ao_kreis_de

sort betnr ao_kreis jahr
egen flg_betao = tag(betnr ao_kreis)
egen count_betao = total(flg_betao), by(betnr)
egen flg_est = tag(betnr)
tab count_betao if flg_est == 1, m sort

sort betnr hauptbet
egen flg_haupt = tag(betnr) if hauptbet == 1
tab count_betao if flg_haupt == 1, m sort

egen firstjhr_bet = min(jahr), by(betnr)
gen aux = ao_kreis if jahr == firstjhr_bet
bys betnr: egen aux2 = max(aux) 
tab count_betao if ao_kreis != aux2 
// Reason: missing county information
replace ao_kreis = aux2 if count_betao > 1 & ao_kreis != aux2 & aux2 != .
drop flg_betao count_betao aux aux2

//	hq_kreis needs to be assigned at untid level
sort betnr hq_kreis jahr
egen flg_bethq = tag(betnr hq_kreis)
egen count_bethq = total(flg_bethq), by(betnr)
tab count_bethq if flg_est == 1, m sort
tab count_bethq if flg_haupt == 1, m sort

bys untid: egen firstjhr_unt = min(jahr) if count_est > 1
gen aux = hq_kreis if jahr == firstjhr_unt
bys untid: egen aux2 = max(aux) 
tab count_bethq if hq_kreis != aux2
tab count_bethq if hq_kreis != aux2 & count_est > 1 
//	Reason: establishments that are added later
replace hq_kreis = aux2 if hq_kreis != aux2 & aux2 != . & count_est > 1
drop flg_bethq count_bethq flg_haupt aux aux2

count if ao_kreis != ao_kreis_orig
count if hq_kreis != hq_kreis_orig

count if count_est == 1 & ao_kreis != hq_kreis
replace hq_kreis = ao_kreis if count_est == 1 & ao_kreis != hq_kreis

//	How many betnr move across space?
bys betnr: egen min_kreis = min(ao_kreis_orig)
by  betnr: egen aux = max(min_kreis != ao_kreis_orig)
tab aux if flg_est == 1
drop min_kreis 
bys untid: egen issue_unt = max(aux)
label variable issue_unt "Indicator: at least one establishment moves (firm)"
sort untid jahr
egen flg_unt = tag(untid)
tab issue if flg_unt == 1
rename aux issue_est
label variable issue_est "Indicator: establishment moves"

//	How many hauptbet move across space?
bys untid: egen min_kreis = min(hq_kreis_orig) if count_est > 1
bys betnr: egen aux = max(min_kreis != hq_kreis_orig & min_kreis != .) 
tab aux hauptbet if flg_est == 1
tab aux hauptbet if flg_est == 1 & issue_est == 1
rename aux issue_est_haupt
label variable issue_est_haupt "Indicator: HQ of establishment moves"
drop min_kreis

save data/train_pre_est.dta, replace

********************************************************************************
***	MERGE SECTOR CHARACTERISTICS

//	Use establishment sector to assign sector characteristics: 
//	Use sector from first period (otherwise, establishment FE and sector FE are not collinear)

use data/train_pre_est.dta, clear

bys betnr (jahr): egen min_yr = min(jahr)
gen aux = w93_3_gen if jahr == min_yr
by  betnr: egen br1 = min(aux)
count if br1 != w93_3_gen & jahr == min_yr
drop min_yr aux

//	Problem arrival rate
replace br1 = int(br1/10)
merge m:1 br1 using data/Covariates_sector.dta
drop if _merge == 2
drop _merge

********************************************************************************
***	MERGE DATA ON TRAINS

merge m:1 hq_kreis ao_kreis using data/traveltimes_thust_wide_2018082.dta
drop if _merge == 2
gen d_traindata = (_merge == 3)
label variable d_traindata "Indicator (est): hq_kreis and ao_kreis connected"
drop _merge

egen flg_untjhr = tag(untid jahr)

bys untid jahr: egen e_traindata = max(d_traindata)
label variable e_traindata "Indicator (firm): HQ and at least one est connected to ICE, not same county"

gen ln_empl_unt = log(empl_unt)

*	Statistics
foreach v in 2001 2004 2008 {
	qui gen br_time_mean`v' = mean_travel`v' + (mean_chge`v' * 30) 
	qui gen br_time_min`v' = min_travel`v' + (min_chge`v' * 30) 
	}

qui gen br_diff_0004_2 = br_time_mean2004 - br_time_mean2001
qui gen br_diff_0408_2 = br_time_mean2008 - br_time_mean2004
qui gen br_diff_0008_2 = br_time_mean2008 - br_time_mean2001

qui gen br_diff_0004_min = br_time_min2004 - br_time_min2001
qui gen br_diff_0408_min = br_time_min2008 - br_time_min2004
qui gen br_diff_0008_min = br_time_min2008 - br_time_min2001

count if br_diff_0004_2 == .
count if br_diff_0408_2 == .

********************************************************************************
***	VARIABLE DEFINITION 

***	TREATMENT DUMMIES

bys betnr: egen entryyear = min(jahr)
gen grd_jahr = year(grd_dat)
gen delta = entryyear - grd_jahr
qui replace entryyear = grd_jahr if entryyear == 2000 & grd_jahr != . & grd_jahr < entryyear
qui replace entryyear = grd_jahr if entryyear != grd_jahr & delta == 1
drop delta

//	FFM-COL
qui gen faster1 = (br_diff_0004_min <= -30 & br_diff_0008_min <= -30 & jahr >= 2004 & entryyear < 2002) // baseline
qui gen faster1_rob = (br_diff_0004_min <= -30 & br_diff_0008_min <= -30 & jahr >= 2004 & entryyear < 2002) // robustness: no HH-Berlin
qui gen faster1_direct = (br_diff_0004_min <= -30 & br_diff_0008_min <= -30 & jahr >= 2004 & entryyear < 2002) // robustness: only direct connections
qui replace faster1_direct = 0 if min_chge2004 > 0 & min_chge2008 > 0 

//	ING-NUR, HH-BER
qui replace faster1 = 1 if (br_diff_0408_min <= -30 & jahr >= 2008 & d_HB2008 == 1 & entryyear < 2004) 
qui replace faster1 = 1 if (br_diff_0408_min <= -30 & jahr >= 2008 & d_HB2008 == 0 & entryyear < 2006) 
qui replace faster1_rob = 1 if (br_diff_0408_min <= -30 & jahr >= 2008 & d_HB2008 == 0 & entryyear < 2006) // Exclude treatment through HH-Berlin connection
qui replace faster1_direct = 1 if (br_diff_0408_min <= -30 & jahr >= 2008 & d_HB2008 == 1 & entryyear < 2004) 
qui replace faster1_direct = 1 if (br_diff_0408_min <= -30 & jahr >= 2008 & d_HB2008 == 0 & entryyear < 2006) 
qui replace faster1_direct = 0 if min_chge2008 > 0 

//	New stations established on FFM-COL route
qui replace faster1 = 1 if br_time_mean2001 == . & jahr >= 2004 & entryyear < 2002 & (ao_kreis == 6533 | hq_kreis == 6533) & ao_kreis != hq_kreis
qui replace faster1_rob = 1 if br_time_mean2001 == . & jahr >= 2004 & entryyear < 2002 & (ao_kreis == 6533 | hq_kreis == 6533) & ao_kreis != hq_kreis
qui replace faster1_direct = 1 if br_time_mean2001 == . & jahr >= 2004 & entryyear < 2002 & (ao_kreis == 6533 | hq_kreis == 6533) & ao_kreis != hq_kreis & min_chge2008 == 0

***	Treatment assignment to firms/establishments
foreach variable in faster1 faster1_rob faster1_direct {

	replace `variable' = 0 if d_traindata == 0
	replace `variable' = 0 if hq_kreis == ao_kreis

	display "*************************************"
	label variable `variable' "Indicator: travel time reduction of at least 30 minutes (`variable')"
	
	bys betnr: egen ever_`variable' = max(`variable')
	label variable ever_`variable' "Indicator: treated establishment (`variable')"

	bys untid jahr: egen firm_`variable' = max(`variable')
	label variable firm_`variable' "Indicator: firm has at least one est. w/ tt. reduction of at least 30 minutes (`variable')"
	
	bys untid: egen firm_ever_`variable' = max(`variable')
	label variable firm_ever_`variable' "Indicator: treated firm (`variable')"
}

* Early treatment:
* FFM - Cologne opened in 2002
qui gen faster_early = (br_diff_0004_min <= -30 & br_diff_0008_min <= -30 & jahr >= 2004 & entryyear < 2002 & d_KRM2004 == 1)
qui replace faster_early = 1 if br_time_mean2001 == . & jahr >= 2004 & entryyear < 2002 & (ao_kreis == 6533 | hq_kreis == 6533) & ao_kreis != hq_kreis

* Late treatment:
* HH-Berlin opened in 2004
* Nuremberg-Ingolstadt opened in 2006
* Berlin-Leipzig opened in 2006
gen faster_mid = (br_diff_0408_min <= -30 & jahr >= 2006 & d_HB2008 == 1 & entryyear < 2004) // Hamburg Berlin
gen faster_late = (br_diff_0408_min <= -30 & jahr >= 2008 & d_Ing2008 == 1 & entryyear < 2006) // Nuremberg Ingolstadt
replace faster_late = 1 if (br_diff_0408_min <= -30 & jahr >= 2008 & d_BL2008 == 1 & entryyear < 2006) // Berlin Leipzig

gen faster_any = (faster_early == 1 | faster_mid == 1 | faster_late == 1)

***	Treatment assignment to firms/establishments
foreach time_period in early mid late any {

	display "*************************************"
	label variable faster_`time_period' "Indicator: travel time reduction of at least 30 minutes (`time_period')"
	
	bys betnr: egen ever_faster_`time_period' = max(faster_`time_period')
	label variable ever_faster_`time_period' "Indicator: treated establishment (`time_period')"

	bys untid jahr: egen firm_faster_`time_period' = max(faster_`time_period')
	label variable firm_faster_`time_period' "Indicator: firm has at least one est. w/ tt. reduction of at least 30 minutes (`variable')"
	
	bys untid: egen firm_ever_faster_`time_period' = max(faster_`time_period')
	label variable firm_ever_faster_`time_period' "Indicator: treated firm (`variable')"
}

*** OTHER VARIABLES

xtset, clear
xtset betnr jahr

gen ln_empl_bet = log(empl_bet)
gen ln_prdt_bet = log(prdt_wkrs_bet)
gen ln_prdt_unt = log(prdt_wkrs_unt)

forvalues t = 2001/2010 {
	gen d_year`t' = (jahr == `t')
}

// county-year FE
gen double aux = ao_kreis * 10000
gen double county_year = aux + jahr
drop aux

// HQ-county-year FE
gen double aux = hq_kreis * 10000
gen double hqcounty_year = aux + jahr
drop aux

//	Sector-year FE
gen double sector_year = br1 * 10000 + jahr

keep if prdt_wkrs_bet > 0

save data/train_analysis_est.dta, replace
erase data/temp_train_est.dta
erase data/train_pre_est.dta

log close
